1 Setup and Loading Libraries

# knitr chunk defaults: echo code, include output, center all figures
knitr::opts_chunk$set(echo = TRUE, include = TRUE,fig.align = "center")
# Disable scientific notation in printed numbers; render NA as blank in kable tables
options(scipen = 999, knitr.kable.NA = "")

# Installing libraries (if do not have)
##install.packages("tidyverse")
##install.packages("lubridate")
##install.packages("readxl")
##install.packages("skimr")
##install.packages("magrittr")
##install.packages("tidyquant")
##install.packages("tsibble")
##install.packages("feasts")
##install.packages("ggcorrplot")
##install.packages("glmnet")
##install.packages("caret")
##install.packages("rattle")

# Importing libraries
library(tidyverse)   # dplyr, ggplot2, readr, tidyr, ...
library(lubridate)   # date handling (as_date)
library(readxl)      # NOTE(review): not used in the visible code -- confirm needed
library(skimr)       # skim() summary tables
library(magrittr)    # %>% pipe (also re-exported by tidyverse)
library(tidyquant)   # theme_tq() for ggplot
library(tsibble)     # NOTE(review): not used in the visible code -- confirm needed
library(feasts)      # NOTE(review): not used in the visible code -- confirm needed
library(ggcorrplot)  # NOTE(review): not used in the visible code -- confirm needed
library(glmnet)      # elastic-net backend for caret
library(caret)       # train()/trainControl()/resamples()
library(rattle)      # fancyRpartPlot() for rpart trees

# Fix the RNG seed so CV fold assignment and ranger forests are reproducible
set.seed(1234)

2 Import Data

Hourly solar power plant production data is imported from a csv file.

df <- read_csv("production_data_with_weather.csv") %>% select(-timestamp)
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   Date = col_date(format = ""),
##   Hour = col_double(),
##   Production = col_double(),
##   timestamp = col_datetime(format = ""),
##   CLOUD_LOW_LAYER_37.75_34.25 = col_double(),
##   CLOUD_LOW_LAYER_37.75_34.5 = col_double(),
##   CLOUD_LOW_LAYER_38_34.25 = col_double(),
##   CLOUD_LOW_LAYER_38_34.5 = col_double(),
##   DSWRF_37.75_34.25 = col_double(),
##   DSWRF_37.75_34.5 = col_double(),
##   DSWRF_38_34.25 = col_double(),
##   DSWRF_38_34.5 = col_double(),
##   TEMP_37.75_34.25 = col_double(),
##   TEMP_37.75_34.5 = col_double(),
##   TEMP_38_34.25 = col_double(),
##   TEMP_38_34.5 = col_double()
## )
head(df)
## # A tibble: 6 x 15
##   Date        Hour Production CLOUD_LOW_LAYER… CLOUD_LOW_LAYER… CLOUD_LOW_LAYER…
##   <date>     <dbl>      <dbl>            <dbl>            <dbl>            <dbl>
## 1 2019-10-09     0          0                0                0                0
## 2 2019-10-09     1          0                0                0                0
## 3 2019-10-09     2          0                0                0                0
## 4 2019-10-09     3          0                0                0                0
## 5 2019-10-09     4          0                0                0                0
## 6 2019-10-09     5          0                0                0                0
## # … with 9 more variables: CLOUD_LOW_LAYER_38_34.5 <dbl>,
## #   DSWRF_37.75_34.25 <dbl>, DSWRF_37.75_34.5 <dbl>, DSWRF_38_34.25 <dbl>,
## #   DSWRF_38_34.5 <dbl>, TEMP_37.75_34.25 <dbl>, TEMP_37.75_34.5 <dbl>,
## #   TEMP_38_34.25 <dbl>, TEMP_38_34.5 <dbl>

3 Descriptive Analysis

Summary statistics of variables are shown.

skim(df)
Data summary
Name df
Number of rows 11592
Number of columns 15
_______________________
Column type frequency:
Date 1
numeric 14
________________________
Group variables None

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
Date 0 1 2019-10-09 2021-02-02 2020-06-06 483

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Hour 0 1 11.50 6.92 0.00 5.75 11.50 17.25 23.00 ▇▇▆▇▇
Production 0 1 6.20 9.04 0.00 0.00 0.00 11.80 30.00 ▇▁▁▁▁
CLOUD_LOW_LAYER_37.75_34.25 0 1 14.74 30.05 0.00 0.00 0.00 6.00 100.00 ▇▁▁▁▁
CLOUD_LOW_LAYER_37.75_34.5 0 1 15.45 30.36 0.00 0.00 0.00 10.00 100.00 ▇▁▁▁▁
CLOUD_LOW_LAYER_38_34.25 0 1 16.17 31.48 0.00 0.00 0.00 11.00 100.00 ▇▁▁▁▁
CLOUD_LOW_LAYER_38_34.5 0 1 16.33 31.30 0.00 0.00 0.00 13.00 100.00 ▇▁▁▁▁
DSWRF_37.75_34.25 0 1 209.32 268.02 0.00 0.00 70.00 380.00 950.00 ▇▂▂▁▁
DSWRF_37.75_34.5 0 1 208.64 267.75 0.00 0.00 70.00 380.00 950.00 ▇▂▂▁▁
DSWRF_38_34.25 0 1 208.81 268.09 0.00 0.00 60.00 380.00 950.00 ▇▂▂▁▁
DSWRF_38_34.5 0 1 207.94 267.06 0.00 0.00 64.00 372.75 952.00 ▇▂▂▁▁
TEMP_37.75_34.25 0 1 12.97 9.11 -10.28 5.86 12.22 19.38 38.95 ▁▇▇▅▁
TEMP_37.75_34.5 0 1 12.04 9.15 -12.31 4.90 11.26 18.41 37.65 ▁▇▇▅▂
TEMP_38_34.25 0 1 10.98 9.06 -12.34 3.87 10.22 17.39 36.25 ▁▇▇▅▂
TEMP_38_34.5 0 1 8.99 9.05 -15.47 1.75 8.15 15.55 33.65 ▁▇▇▅▂

Checking for which hours of the day there are production

plot(df$Hour, df$Production, xlab = "Hour", ylab = "Production", main = "Hourly Production Plot")

Filtering out the hours without production (Hour < 5 or Hour > 19) from the data, keeping only hours 5 through 19

# Keep only the daylight window (hours 5 through 19, inclusive); production
# is zero for all other hours, as seen in the scatter plot above.
df2 <- df %>% filter(between(Hour, 5, 19))
# Show the first 20 rows: each day now contributes 15 hourly records
head(df2, n = 20)
## # A tibble: 20 x 15
##    Date        Hour Production CLOUD_LOW_LAYER_37.75_34… CLOUD_LOW_LAYER_37.75_…
##    <date>     <dbl>      <dbl>                     <dbl>                   <dbl>
##  1 2019-10-09     5       0                            0                       0
##  2 2019-10-09     6       0.04                         0                       0
##  3 2019-10-09     7       3.7                          0                       0
##  4 2019-10-09     8      11.2                          0                       0
##  5 2019-10-09     9      19.8                          0                       0
##  6 2019-10-09    10      24.9                          0                       0
##  7 2019-10-09    11      25.7                          0                       0
##  8 2019-10-09    12      25.7                          0                       0
##  9 2019-10-09    13      25.7                          0                       0
## 10 2019-10-09    14      24                            0                       0
## 11 2019-10-09    15      18.3                          0                       0
## 12 2019-10-09    16      12.3                          0                       0
## 13 2019-10-09    17       4.27                         0                       0
## 14 2019-10-09    18       0.08                         0                       0
## 15 2019-10-09    19       0                            0                       0
## 16 2019-10-10     5       0                            0                       0
## 17 2019-10-10     6       0.04                         0                       0
## 18 2019-10-10     7       3.63                         0                       0
## 19 2019-10-10     8      12.5                          0                       0
## 20 2019-10-10     9      19.9                          0                       0
## # … with 10 more variables: CLOUD_LOW_LAYER_38_34.25 <dbl>,
## #   CLOUD_LOW_LAYER_38_34.5 <dbl>, DSWRF_37.75_34.25 <dbl>,
## #   DSWRF_37.75_34.5 <dbl>, DSWRF_38_34.25 <dbl>, DSWRF_38_34.5 <dbl>,
## #   TEMP_37.75_34.25 <dbl>, TEMP_37.75_34.5 <dbl>, TEMP_38_34.25 <dbl>,
## #   TEMP_38_34.5 <dbl>

Scatter plot, variable distribution and correlation table for the target and potential independent variables. We can see that the same weather variables measured at different coordinates are highly correlated with each other.

GGally::ggpairs(df %>% select(-Date, -Hour))
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

As the same type of variables with different coordinates are so similar to each other in numeric manner and highly correlated, we are calculating their average values to reduce them into one variable. This is done because when modeling with time series, there should not be highly correlated variables in the model.

# The four coordinate-specific series of each weather variable are highly
# correlated (see ggpairs above), so collapse each group into its row-wise
# mean. rowMeans(across(starts_with(...))) computes the same averages as the
# original hand-written four-term sums, but generalizes to any number of
# coordinate columns and cannot silently miss one.
df3 <- df2 %>% 
  mutate(CLOUD_LOW_LAYER_AVG = rowMeans(across(starts_with("CLOUD_LOW_LAYER"))),
         DSWRF_AVG = rowMeans(across(starts_with("DSWRF"))),
         TEMP_AVG = rowMeans(across(starts_with("TEMP")))) %>% 
  select(Date, Hour, Production, CLOUD_LOW_LAYER_AVG, DSWRF_AVG, TEMP_AVG)
# Preview the reduced data set (6 columns)
head(df3)
## # A tibble: 6 x 6
##   Date        Hour Production CLOUD_LOW_LAYER_AVG DSWRF_AVG TEMP_AVG
##   <date>     <dbl>      <dbl>               <dbl>     <dbl>    <dbl>
## 1 2019-10-09     5       0                      0       0       12.7
## 2 2019-10-09     6       0.04                   0       0       12.4
## 3 2019-10-09     7       3.7                    0       0       12.2
## 4 2019-10-09     8      11.2                    0      20       15.1
## 5 2019-10-09     9      19.8                    0      62.8     16.7
## 6 2019-10-09    10      24.9                    0     475       17.9

Summary statistics of reduced data

skim(df3)
Data summary
Name df3
Number of rows 7245
Number of columns 6
_______________________
Column type frequency:
Date 1
numeric 5
________________________
Group variables None

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
Date 0 1 2019-10-09 2021-02-02 2020-06-06 483

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Hour 0 1 12.00 4.32 5.00 8.00 12.00 16.00 19.00 ▇▇▇▇▇
Production 0 1 9.92 9.69 0.00 0.15 6.69 19.87 30.00 ▇▂▂▃▂
CLOUD_LOW_LAYER_AVG 0 1 15.61 29.30 0.00 0.00 0.00 14.50 100.00 ▇▁▁▁▁
DSWRF_AVG 0 1 310.43 285.05 0.00 20.00 260.00 525.00 945.00 ▇▃▃▂▂
TEMP_AVG 0 1 12.50 9.63 -12.02 4.94 11.42 19.96 36.63 ▁▇▇▆▂

Histograms of target and independent variables

# Histogram of the target variable (default 30 bins)
ggplot(df3, aes(x=Production)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Histogram of average temperature
ggplot(df3, aes(x=TEMP_AVG)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Histogram of average downward short-wave radiation flux
ggplot(df3, aes(x=DSWRF_AVG)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Histogram of average low-layer cloud cover
ggplot(df3, aes(x=CLOUD_LOW_LAYER_AVG)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Boxplots of target and independent variables

# Draw the four boxplots on a single 2x2 canvas; capture the old graphics
# settings and restore them afterwards instead of hard-coding mfrow back.
opar <- par(mfrow = c(2, 2))
boxplot(df3$Production, main = "Production")
boxplot(df3$TEMP_AVG, main = "Temperature")
boxplot(df3$DSWRF_AVG, main = "DSWRF")
boxplot(df3$CLOUD_LOW_LAYER_AVG, main = "Cloud Low Layer")

par(opar)

Scatter plot of Cloud Low Layer vs Production values is shown below. We can observe that higher cloud low layer values cause a lower frequency of high production values.

plot(df3$CLOUD_LOW_LAYER_AVG, df3$Production, xlab = "Cloud Low Layer", ylab = "Production", main = "Cloud Low Layer-Production Scatter Plot")

Scatter plot of DSWRF vs Production values is shown below. We can say that DSWRF values are directly proportional to the Production values. As the DSWRF values increases, the Production values also increase.

plot(df3$DSWRF_AVG, df3$Production, xlab = "DSWRF", ylab = "Production", main = "DSWRF-Production Scatter Plot")

Scatter plot of Hour vs DSWRF values is shown below. There is a relationship between DSWRF values and hours similar to the relation of Production-Hour

plot(df3$Hour, df3$DSWRF_AVG , xlab = "Hour", ylab = "DSWRF", main = "Hour-DSWRF Scatter Plot")

Scatter plot of Hour vs Temperature values is shown below. As expected, temperatures are higher in the afternoon hours and lower in the morning and evening hours.

plot(df3$Hour, df3$TEMP_AVG, xlab = "Hour", ylab = "Temperature", main = "Hour-Temperature Scatter Plot")

Scatter plot of DSWRF vs Temperature values is shown below. It is obvious that low temperature values correspond to low DSWRF values, while the higher DSWRF values, the slightly higher the temperature values.

plot(df3$DSWRF_AVG, df3$TEMP_AVG , xlab = "DSWRF", ylab = "Temperature", main = "DSWRF-Temperature Scatter Plot")

Line plot of Production as a time series for each hour is shown below. We can observe that there is seasonal effect on the production for morning and evening hours as the day hours are shifting along the year.

# Facet the production series by hour-of-day to expose the seasonal shift in
# sunrise/sunset hours across the year.
ggplot(df3, aes(x = Date, y = Production)) +
  geom_line(aes(color = as.factor(Hour)), size = 0.9, show.legend = FALSE) +
  facet_wrap(~Hour, ncol = 4) +
  labs(title = "Line plot of Production for each hour", x = "Date", y = "Production Amount") +
  theme_tq()

Line plot of Temperature as a time series for each hour is shown below. We can also observe the same seasonal effect on the temperature variable. Temperature is low at winter and autumn months, while it is higher in summer and spring months (as expected).

# Faceted time series of temperature per hour: the same seasonal pattern
df3 %>%
  ggplot(aes(x = Date, y = TEMP_AVG)) + 
  geom_line(aes(color = as.factor(Hour)), size = 0.9, show.legend = F) + 
  labs(title="Line plot of Temperature for each hour", y="Temperature", x="Date") + 
  facet_wrap(~Hour, ncol=4) + 
  theme_tq()

Line plot of DSWRF as a time series for each hour is shown below. We can see the same seasonal effect with temperature variable. However, DSWRF values are almost zero in morning hours.

# Faceted time series of DSWRF per hour: near-zero in early-morning facets
df3 %>%
  ggplot(aes(x = Date, y = DSWRF_AVG)) + 
  geom_line(aes(color = as.factor(Hour)), size = 0.9, show.legend = F) + 
  labs(title="Line plot of DSWRF for each hour", y="DSWRF", x="Date") + 
  facet_wrap(~Hour, ncol=4) + 
  theme_tq()

Line plot of Cloud Low Layer as a time series for each hour is shown below. We can state that there are very few cloudy days at summer months (as expected).

# Faceted time series of cloud cover per hour: sparse cloudiness in summer
df3 %>%
  ggplot(aes(x = Date, y = CLOUD_LOW_LAYER_AVG)) + 
  geom_line(aes(color = as.factor(Hour)), size = 0.9, show.legend = F) + 
  labs(title="Line plot of Cloud Low Layer for each hour", y="Cloud Low Layer", x="Date") + 
  facet_wrap(~Hour, ncol=4) + 
  theme_tq()

Finding the largest mean production level for the whole data to use that hour for parameter tuning and use the same parameters for other hours. Hour 11 or 12 can be used for parameter tuning.

df3 %>% group_by(Hour) %>%  summarise(mean_production = mean(Production)) %>% arrange(desc(mean_production))
## # A tibble: 15 x 2
##     Hour mean_production
##    <dbl>           <dbl>
##  1    11         20.4   
##  2    12         20.4   
##  3    13         19.5   
##  4    10         18.3   
##  5    14         17.7   
##  6    15         14.5   
##  7     9         13.6   
##  8    16          9.20  
##  9     8          7.41  
## 10    17          3.95  
## 11     7          2.37  
## 12    18          0.945 
## 13     6          0.359 
## 14    19          0.0939
## 15     5          0.0364

4 Data Manipulation

New lagged variables are created from both the target and the independent variables. Since the filtered data contains 15 hourly rows per day, a lag of 15 rows corresponds to one day and a lag of 105 rows to one week. (Note: the daily-lag columns carry a "_lag24" suffix even though the lag is 15 rows, reflecting the original 24-hour naming convention.)

# Lagged features: after the hour filter each day has 15 rows, so a lag of
# 15 rows is one day and 105 rows is one week.
# NOTE(review): the "_lag24" suffix is a misnomer -- the lag is 15 rows (one
# day in this data), not 24. Names are kept to match the printed outputs.
# NOTE(review): lag() works on row order, so this assumes there are no
# missing Date/Hour rows in the series -- TODO confirm it is gapless.
df4 <- df3 %>% 
  mutate(Production_lag24 = lag(Production, 15),
         TEMP_AVG_lag24 = lag(TEMP_AVG, 15),
         DSWRF_AVG_lag24 = lag(DSWRF_AVG, 15),
         CLOUD_LOW_LAYER_AVG_lag24 = lag(CLOUD_LOW_LAYER_AVG, 15),
         Production_lag105 = lag(Production, 105),
         TEMP_AVG_lag105 = lag(TEMP_AVG, 105),
         DSWRF_AVG_lag105 = lag(DSWRF_AVG, 105),
         CLOUD_LOW_LAYER_AVG_lag105 = lag(CLOUD_LOW_LAYER_AVG, 105)) %>% 
  # Drop the first week, whose 105-row lags are NA (2019-10-09 + 7 days)
  filter(Date >= as_date('2019-10-16'))
head(df4)
## # A tibble: 6 x 14
##   Date        Hour Production CLOUD_LOW_LAYER_AVG DSWRF_AVG TEMP_AVG
##   <date>     <dbl>      <dbl>               <dbl>     <dbl>    <dbl>
## 1 2019-10-16     5       0                      0       0       15.2
## 2 2019-10-16     6       0                      0       0       14.8
## 3 2019-10-16     7       0.18                   0       0       14.9
## 4 2019-10-16     8       5.14                   0      10       17.7
## 5 2019-10-16     9      18.6                    0      34.8     19.8
## 6 2019-10-16    10      23.9                    0     388.      21.4
## # … with 8 more variables: Production_lag24 <dbl>, TEMP_AVG_lag24 <dbl>,
## #   DSWRF_AVG_lag24 <dbl>, CLOUD_LOW_LAYER_AVG_lag24 <dbl>,
## #   Production_lag105 <dbl>, TEMP_AVG_lag105 <dbl>, DSWRF_AVG_lag105 <dbl>,
## #   CLOUD_LOW_LAYER_AVG_lag105 <dbl>

Train and test data sets are created

# Chronological split: train before Dec 2020, test Dec 2020 - Jan 2021
df4_train <- df4 %>% filter(Date < as_date('2020-12-01')) 
df4_test <- df4 %>% filter(Date >= as_date('2020-12-01') & Date <= as_date('2021-01-31'))

Splitting train data into hourly format to have different models for every hour

# One training subset per hour-of-day model (df4_train_h5 ... df4_train_h19).
# A loop replaces 15 copy-pasted filter lines; assign() creates exactly the
# same individually named data frames that the later modeling chunks expect.
# (A named list via split(df4_train, df4_train$Hour) would be cleaner, but
# would require renaming every downstream reference.)
for (h in 5:19) {
  assign(paste0("df4_train_h", h), df4_train %>% filter(Hour == h))
}

Splitting test data into hourly format to have different models for every hour

# One test subset per hour-of-day model (df4_test_h5 ... df4_test_h19),
# mirroring the training split; loop + assign() replaces 15 duplicated lines
# while producing the same named data frames.
for (h in 5:19) {
  assign(paste0("df4_test_h", h), df4_test %>% filter(Hour == h))
}

5 Modeling

Different models such as linear regression, decision tree, random forest, GLMNET are tried for data set of Hour 11 to make the parameter tuning on the data.

# Shared resampling scheme for all models: 5-fold CV repeated 5 times
# (25 resamples per candidate parameter setting)
fitControl <- trainControl(method = "repeatedcv",
                           number = 5,
                           repeats = 5)

Linear Regression model with repeated cross validation (fitControl) is created by using caret package ("lm" method). According to results, intercept parameter is tuned and set to TRUE.

# Linear-regression benchmark for hour 11. Date/Hour are dropped; every
# remaining column (current + lagged weather and production) is a predictor.
# Note: "lm" has no tuning grid besides the intercept flag, so tuneLength = 5
# has no practical effect here.
lin_reg_h11 <- train(Production ~ ., 
                 data = df4_train_h11 %>% select(-Date, -Hour),
                 method = "lm",
                 trControl = fitControl,
                 tuneLength = 5)
lin_reg_h11
## Linear Regression 
## 
## 412 samples
##  11 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 5 times) 
## Summary of sample sizes: 330, 329, 329, 330, 330, 330, ... 
## Resampling results:
## 
##   RMSE     Rsquared   MAE     
##   5.22662  0.5116809  3.609332
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
summary(lin_reg_h11)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -24.8165  -1.6674   0.7105   2.6709  18.4428 
## 
## Coefficients:
##                             Estimate Std. Error t value             Pr(>|t|)
## (Intercept)                14.041508   1.456740   9.639 < 0.0000000000000002
## CLOUD_LOW_LAYER_AVG        -0.068715   0.015076  -4.558        0.00000687140
## DSWRF_AVG                   0.026823   0.004497   5.965        0.00000000539
## TEMP_AVG                    0.004738   0.112456   0.042               0.9664
## Production_lag24            0.209047   0.049474   4.225        0.00002957204
## TEMP_AVG_lag24              0.018438   0.109055   0.169               0.8658
## DSWRF_AVG_lag24            -0.011216   0.004544  -2.468               0.0140
## CLOUD_LOW_LAYER_AVG_lag24  -0.027582   0.015864  -1.739               0.0829
## Production_lag105          -0.036981   0.048980  -0.755               0.4507
## TEMP_AVG_lag105             0.007737   0.057408   0.135               0.8929
## DSWRF_AVG_lag105           -0.007804   0.004473  -1.745               0.0818
## CLOUD_LOW_LAYER_AVG_lag105 -0.017055   0.015458  -1.103               0.2705
##                               
## (Intercept)                ***
## CLOUD_LOW_LAYER_AVG        ***
## DSWRF_AVG                  ***
## TEMP_AVG                      
## Production_lag24           ***
## TEMP_AVG_lag24                
## DSWRF_AVG_lag24            *  
## CLOUD_LOW_LAYER_AVG_lag24  .  
## Production_lag105             
## TEMP_AVG_lag105               
## DSWRF_AVG_lag105           .  
## CLOUD_LOW_LAYER_AVG_lag105    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.087 on 400 degrees of freedom
## Multiple R-squared:  0.5422, Adjusted R-squared:  0.5296 
## F-statistic: 43.06 on 11 and 400 DF,  p-value: < 0.00000000000000022
lin_reg_h11$finalModel
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Coefficients:
##                (Intercept)         CLOUD_LOW_LAYER_AVG  
##                  14.041508                   -0.068715  
##                  DSWRF_AVG                    TEMP_AVG  
##                   0.026823                    0.004738  
##           Production_lag24              TEMP_AVG_lag24  
##                   0.209047                    0.018438  
##            DSWRF_AVG_lag24   CLOUD_LOW_LAYER_AVG_lag24  
##                  -0.011216                   -0.027582  
##          Production_lag105             TEMP_AVG_lag105  
##                  -0.036981                    0.007737  
##           DSWRF_AVG_lag105  CLOUD_LOW_LAYER_AVG_lag105  
##                  -0.007804                   -0.017055

Decision Tree model with repeated cross validation (fitControl) is created by using caret package (“rpart” method). According to results, cp parameter is tuned and set to 0.01831669.

# CART decision tree for hour 11; tuneLength = 5 searches five cp values.
dec_tree_h11 <- train(Production ~ ., 
                  data = df4_train_h11 %>% select(-Date, -Hour),
                  method = "rpart",
                  trControl = fitControl,
                  tuneLength = 5)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
# Resampling results across the cp grid; smallest RMSE picks the final cp
dec_tree_h11
## CART 
## 
## 412 samples
##  11 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 5 times) 
## Summary of sample sizes: 330, 331, 330, 328, 329, 331, ... 
## Resampling results across tuning parameters:
## 
##   cp          RMSE      Rsquared   MAE     
##   0.01831669  5.502001  0.4651920  3.643034
##   0.02327174  5.510581  0.4597672  3.714958
##   0.05404548  5.567895  0.4432877  3.838971
##   0.10169783  5.905240  0.3742533  4.127225
##   0.39236226  6.703265  0.2895707  4.972411
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.01831669.
trellis.par.set(caretTheme())
# RMSE as a function of the complexity parameter cp
plot(dec_tree_h11)

# Render the final pruned tree
fancyRpartPlot(dec_tree_h11$finalModel)

# Text representation of the final rpart tree
dec_tree_h11$finalModel
## n= 412 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 412 22608.7000 20.513300  
##    2) CLOUD_LOW_LAYER_AVG>=26.375 70  3718.8940 10.256860  
##      4) DSWRF_AVG< 251.25 35   830.3398  6.078857 *
##      5) DSWRF_AVG>=251.25 35  1666.6560 14.434860 *
##    3) CLOUD_LOW_LAYER_AVG< 26.375 342 10019.0000 22.612570  
##      6) DSWRF_AVG< 363.75 62  3572.6530 17.102420  
##       12) DSWRF_AVG< 232.5 8   209.0406  9.973750 *
##       13) DSWRF_AVG>=232.5 54  2896.8410 18.158520  
##         26) DSWRF_AVG_lag105>=301.25 23  1136.4540 14.335650 *
##         27) DSWRF_AVG_lag105< 301.25 31  1174.8720 20.994840 *
##      7) DSWRF_AVG>=363.75 280  4147.0950 23.832680 *

Random Forest model with repeated cross validation (fitControl) is created by using caret package (“ranger” method). Impurity measure is used as an importance criteria and number of trees parameter is selected as 50 because of performance issues. According to results, mtry parameter is tuned and set to 6, splitrule parameter is tuned and set to “variance”, min.node.size parameter is tuned and set to 5.

# Random forest (ranger backend) for hour 11. num.trees is capped at 50 for
# runtime; importance = "impurity" enables variable-importance extraction.
# No tuneLength/tuneGrid is given, so caret's default grid (3 mtry values x
# 2 split rules) is searched, as shown in the output below.
rand_forest_h11 <- train(Production ~ ., 
                       data = df4_train_h11 %>% select(-Date, -Hour),
                       method = "ranger",
                       trControl = fitControl,
                       num.trees = 50,
                       importance = "impurity")
rand_forest_h11
## Random Forest 
## 
## 412 samples
##  11 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 5 times) 
## Summary of sample sizes: 332, 329, 329, 329, 329, 329, ... 
## Resampling results across tuning parameters:
## 
##   mtry  splitrule   RMSE      Rsquared   MAE     
##    2    variance    5.264371  0.4989451  3.616634
##    2    extratrees  5.286603  0.4983441  3.685986
##    6    variance    5.227255  0.5068756  3.521374
##    6    extratrees  5.230724  0.5063643  3.545468
##   11    variance    5.296689  0.4943498  3.526574
##   11    extratrees  5.231504  0.5057880  3.536130
## 
## Tuning parameter 'min.node.size' was held constant at a value of 5
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were mtry = 6, splitrule = variance
##  and min.node.size = 5.
# RMSE across the mtry / splitrule grid
plot(rand_forest_h11)

# The underlying ranger fit (OOB error, tree count, etc.)
rand_forest_h11$finalModel
## Ranger result
## 
## Call:
##  ranger::ranger(dependent.variable.name = ".outcome", data = x,      mtry = min(param$mtry, ncol(x)), min.node.size = param$min.node.size,      splitrule = as.character(param$splitrule), write.forest = TRUE,      probability = classProbs, ...) 
## 
## Type:                             Regression 
## Number of trees:                  50 
## Sample size:                      412 
## Number of independent variables:  11 
## Mtry:                             6 
## Target node size:                 5 
## Variable importance mode:         impurity 
## Splitrule:                        variance 
## OOB prediction error (MSE):       29.22577 
## R squared (OOB):                  0.4687092

GLMNET model with repeated cross validation (fitControl) is created by using caret package ("glmnet" method). According to results, alpha parameter is tuned and set to 0.55, lambda parameter is tuned and set to 0.09524498.

# Elastic net (glmnet) for hour 11 with repeated CV.
# Fixed: the original misspelled the argument as "tuneLenght", which train()
# silently swallowed via its ... argument, so only the default 3x3
# alpha/lambda grid was actually searched (9 rows in the output below).
# Spelled correctly, tuneLength = 5 searches a 5x5 grid.
glmnet_h11 <- train(Production ~ ., 
                         data = df4_train_h11 %>% select(-Date, -Hour),
                         method = "glmnet",
                         trControl = fitControl,
                         tuneLength = 5)
glmnet_h11
## glmnet 
## 
## 412 samples
##  11 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 5 times) 
## Summary of sample sizes: 328, 329, 330, 330, 331, 330, ... 
## Resampling results across tuning parameters:
## 
##   alpha  lambda       RMSE      Rsquared   MAE     
##   0.10   0.009524498  5.167968  0.5197991  3.575222
##   0.10   0.095244980  5.156209  0.5211426  3.580631
##   0.10   0.952449803  5.181813  0.5178728  3.686135
##   0.55   0.009524498  5.164148  0.5204057  3.573684
##   0.55   0.095244980  5.154017  0.5211951  3.591338
##   0.55   0.952449803  5.199747  0.5231104  3.745337
##   1.00   0.009524498  5.160886  0.5209047  3.572558
##   1.00   0.095244980  5.161451  0.5197672  3.613407
##   1.00   0.952449803  5.270544  0.5227863  3.854134
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 0.55 and lambda = 0.09524498.
plot(glmnet_h11)

Candidate models for Hour 11 are compared below. All four models have similar R-squared values; Random Forest has the lowest (so best) MAE, while GLMNET has a slightly lower RMSE. Based on the MAE advantage, the Random Forest model is selected.

# Compare all four hour-11 models over the same 25 CV resamples
results = resamples(list(Linear_Regression = lin_reg_h11, Decision_Tree = dec_tree_h11, Random_Forest = rand_forest_h11, GLMNET = glmnet_h11))
summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: Linear_Regression, Decision_Tree, Random_Forest, GLMNET 
## Number of resamples: 25 
## 
## MAE 
##                       Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Linear_Regression 2.893254 3.455664 3.628979 3.609332 3.745557 4.304403    0
## Decision_Tree     2.917881 3.369264 3.632743 3.643034 3.844598 4.609441    0
## Random_Forest     2.801116 3.281684 3.476505 3.521374 3.801898 4.137494    0
## GLMNET            2.988826 3.413598 3.559016 3.591338 3.760475 4.150317    0
## 
## RMSE 
##                       Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Linear_Regression 3.840532 5.026626 5.328700 5.226620 5.629565 6.165916    0
## Decision_Tree     4.405733 5.147553 5.415286 5.502001 5.842309 6.806200    0
## Random_Forest     4.414741 4.713916 5.296077 5.227255 5.641607 6.249548    0
## GLMNET            4.064381 4.720794 5.158619 5.154017 5.442912 6.105409    0
## 
## Rsquared 
##                        Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
## Linear_Regression 0.3289091 0.4744179 0.4973486 0.5116809 0.5380840 0.6539699
## Decision_Tree     0.2435324 0.4030340 0.4816605 0.4651920 0.5199176 0.6486876
## Random_Forest     0.3512146 0.4489992 0.4965389 0.5068756 0.5720000 0.6818772
## GLMNET            0.3403778 0.4619531 0.5320585 0.5211951 0.5818343 0.6937941
##                   NA's
## Linear_Regression    0
## Decision_Tree        0
## Random_Forest        0
## GLMNET               0
bwplot(results)

Modeling each hour with tuned parameters of all four models to see which one is the best model for different hours. Models for each hour and their comparison results are shown below.

Hour 5

# Hour-5 linear model reusing the intercept setting tuned on hour 11.
# Fixes: T -> TRUE (T is a reassignable binding, not a reserved word), and
# the tuneLength argument is dropped because train() ignores it whenever an
# explicit tuneGrid is supplied.
# The rank-deficient-fit warnings printed below arise because hour-5
# production is almost always ~0, leaving some (lagged) predictors constant
# or collinear within resamples -- presumably harmless here, but worth noting.
lin_reg_h5 <- train(Production ~ ., 
                    data = df4_train_h5 %>% select(-Date, -Hour),
                    method = "lm",
                    trControl = fitControl,
                    tuneGrid = expand.grid(intercept = TRUE))
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
# Hour 5: train three more models on the hour-5 training split (the linear
# model lin_reg_h5 is fitted above). Date and Hour are dropped from the
# predictors (Date is an identifier; Hour is constant within this subset).
dec_tree_h5 <- train(Production ~ .,
                     data = df4_train_h5 %>% select(-Date, -Hour),
                     method = "rpart",
                     trControl = fitControl,
                     # cp fixed from earlier tuning; tuneLength is ignored
                     # whenever tuneGrid is supplied, so it is omitted here
                     tuneGrid = expand.grid(cp = 0.01831669))

rand_forest_h5 <- train(Production ~ .,
                        data = df4_train_h5 %>% select(-Date, -Hour),
                        method = "ranger",
                        trControl = fitControl,
                        num.trees = 50,
                        tuneGrid = expand.grid(mtry = 6,
                                               splitrule = "variance",
                                               min.node.size = 5),
                        importance = "impurity")

glmnet_h5 <- train(Production ~ .,
                   data = df4_train_h5 %>% select(-Date, -Hour),
                   method = "glmnet",
                   trControl = fitControl,
                   # fixed typo "tuneLenght=": the misspelled argument was
                   # silently forwarded through ... to the fitting call
                   tuneGrid = expand.grid(alpha = 0.1, lambda = 0.09524498))

# Pool the cross-validation resamples so all four models are compared on
# identical folds ("<-" for assignment, per the tidyverse style guide).
results <- resamples(list(Linear_Regression = lin_reg_h5,
                          Decision_Tree = dec_tree_h5,
                          Random_Forest = rand_forest_h5,
                          GLMNET = glmnet_h5))
summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: Linear_Regression, Decision_Tree, Random_Forest, GLMNET 
## Number of resamples: 25 
## 
## MAE 
##                         Min.    1st Qu.     Median       Mean    3rd Qu.
## Linear_Regression 0.06109701 0.07149871 0.16213142 0.19377222 0.26224125
## Decision_Tree     0.02534893 0.04190023 0.05595834 0.07307654 0.08807493
## Random_Forest     0.01172859 0.04033275 0.05104760 0.07127874 0.08871822
## GLMNET            0.04697540 0.05254211 0.05658586 0.12308514 0.17762195
##                        Max. NA's
## Linear_Regression 0.7202957    0
## Decision_Tree     0.1492681    0
## Random_Forest     0.1426512    0
## GLMNET            0.3413626    0
## 
## RMSE 
##                         Min.    1st Qu.     Median      Mean   3rd Qu.     Max.
## Linear_Regression 0.08099045 0.08565357 1.21231884 1.2301644 1.8363075 5.391001
## Decision_Tree     0.08707441 0.14571653 0.18631673 0.3857114 0.2864111 1.201284
## Random_Forest     0.03166954 0.16942030 0.30825572 0.4457357 0.4347880 1.194677
## GLMNET            0.05790149 0.06623457 0.09284883 0.7194918 1.2165981 2.253347
##                   NA's
## Linear_Regression    0
## Decision_Tree        0
## Random_Forest        0
## GLMNET               0
## 
## Rsquared 
##                           Min.    1st Qu.     Median       Mean   3rd Qu.
## Linear_Regression 0.0003331962 0.01634230 0.02856765 0.07717957 0.1430698
## Decision_Tree     0.0733450666 0.14425400 0.20204982 0.28841956 0.5049438
## Random_Forest     0.0132893289 0.16998356 0.28523050 0.28870035 0.3953266
## GLMNET            0.0004256890 0.02797673 0.07236170 0.09339992 0.1486044
##                        Max. NA's
## Linear_Regression 0.2788443    0
## Decision_Tree     0.5842820    0
## Random_Forest     0.5738456    0
## GLMNET            0.2478791    0
# Box-and-whisker plots of the resampled MAE / RMSE / R-squared per model
bwplot(results)

Hour 6

# Hour 6: train four competing models on the hour-6 training split.
# Date and Hour are dropped from the predictors (Date is an identifier;
# Hour is constant within this subset).
lin_reg_h6 <- train(Production ~ .,
                    data = df4_train_h6 %>% select(-Date, -Hour),
                    method = "lm",
                    trControl = fitControl,
                    # TRUE instead of T: T is a reassignable variable
                    tuneGrid = expand.grid(intercept = TRUE))

dec_tree_h6 <- train(Production ~ .,
                     data = df4_train_h6 %>% select(-Date, -Hour),
                     method = "rpart",
                     trControl = fitControl,
                     # cp fixed from earlier tuning; tuneLength is ignored
                     # whenever tuneGrid is supplied, so it is omitted here
                     tuneGrid = expand.grid(cp = 0.01831669))

rand_forest_h6 <- train(Production ~ .,
                        data = df4_train_h6 %>% select(-Date, -Hour),
                        method = "ranger",
                        trControl = fitControl,
                        num.trees = 50,
                        tuneGrid = expand.grid(mtry = 6,
                                               splitrule = "variance",
                                               min.node.size = 5),
                        importance = "impurity")

glmnet_h6 <- train(Production ~ .,
                   data = df4_train_h6 %>% select(-Date, -Hour),
                   method = "glmnet",
                   trControl = fitControl,
                   # fixed typo "tuneLenght=": the misspelled argument was
                   # silently forwarded through ... to the fitting call
                   tuneGrid = expand.grid(alpha = 0.1, lambda = 0.09524498))

# Pool the cross-validation resamples so all four models are compared on
# identical folds ("<-" for assignment, per the tidyverse style guide).
results <- resamples(list(Linear_Regression = lin_reg_h6,
                          Decision_Tree = dec_tree_h6,
                          Random_Forest = rand_forest_h6,
                          GLMNET = glmnet_h6))
summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: Linear_Regression, Decision_Tree, Random_Forest, GLMNET 
## Number of resamples: 25 
## 
## MAE 
##                         Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
## Linear_Regression 0.16940398 0.2001016 0.2429741 0.2683884 0.3530160 0.4884026
## Decision_Tree     0.09787429 0.1530611 0.1679170 0.1851597 0.2106784 0.3095159
## Random_Forest     0.08531213 0.1116778 0.1395333 0.1526156 0.1973566 0.2670701
## GLMNET            0.16498923 0.2152129 0.2348562 0.2587064 0.2950568 0.3900661
##                   NA's
## Linear_Regression    0
## Decision_Tree        0
## Random_Forest        0
## GLMNET               0
## 
## RMSE 
##                        Min.   1st Qu.    Median      Mean   3rd Qu.     Max.
## Linear_Regression 0.2194157 0.2922248 1.3590824 0.9674428 1.5107835 2.091764
## Decision_Tree     0.1692675 0.2804642 0.3465727 0.5427619 0.4729900 1.423948
## Random_Forest     0.1862462 0.2833799 0.3993544 0.5717088 0.6353569 1.412374
## GLMNET            0.2251155 0.2942282 1.1294281 0.8548998 1.3440668 1.506428
##                   NA's
## Linear_Regression    0
## Decision_Tree        0
## Random_Forest        0
## GLMNET               0
## 
## Rsquared 
##                        Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
## Linear_Regression 0.0508196 0.2050972 0.2909114 0.4762651 0.7565689 0.8407262
## Decision_Tree     0.2501168 0.6493123 0.7455029 0.6614932 0.8008137 0.9035438
## Random_Forest     0.2499238 0.5635486 0.7105843 0.6449164 0.8186044 0.8887055
## GLMNET            0.1342418 0.2406220 0.3251768 0.4760765 0.7457675 0.8705448
##                   NA's
## Linear_Regression    0
## Decision_Tree        0
## Random_Forest        0
## GLMNET               0
# Box-and-whisker plots of the resampled MAE / RMSE / R-squared per model
bwplot(results)

Hour 7

# Hour 7: train four competing models on the hour-7 training split.
# Date and Hour are dropped from the predictors (Date is an identifier;
# Hour is constant within this subset).
lin_reg_h7 <- train(Production ~ .,
                    data = df4_train_h7 %>% select(-Date, -Hour),
                    method = "lm",
                    trControl = fitControl,
                    # TRUE instead of T: T is a reassignable variable
                    tuneGrid = expand.grid(intercept = TRUE))

dec_tree_h7 <- train(Production ~ .,
                     data = df4_train_h7 %>% select(-Date, -Hour),
                     method = "rpart",
                     trControl = fitControl,
                     # cp fixed from earlier tuning; tuneLength is ignored
                     # whenever tuneGrid is supplied, so it is omitted here
                     tuneGrid = expand.grid(cp = 0.01831669))

rand_forest_h7 <- train(Production ~ .,
                        data = df4_train_h7 %>% select(-Date, -Hour),
                        method = "ranger",
                        trControl = fitControl,
                        num.trees = 50,
                        tuneGrid = expand.grid(mtry = 6,
                                               splitrule = "variance",
                                               min.node.size = 5),
                        importance = "impurity")

glmnet_h7 <- train(Production ~ .,
                   data = df4_train_h7 %>% select(-Date, -Hour),
                   method = "glmnet",
                   trControl = fitControl,
                   # fixed typo "tuneLenght=": the misspelled argument was
                   # silently forwarded through ... to the fitting call
                   tuneGrid = expand.grid(alpha = 0.1, lambda = 0.09524498))

# Pool the cross-validation resamples so all four models are compared on
# identical folds ("<-" for assignment, per the tidyverse style guide).
results <- resamples(list(Linear_Regression = lin_reg_h7,
                          Decision_Tree = dec_tree_h7,
                          Random_Forest = rand_forest_h7,
                          GLMNET = glmnet_h7))
summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: Linear_Regression, Decision_Tree, Random_Forest, GLMNET 
## Number of resamples: 25 
## 
## MAE 
##                        Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
## Linear_Regression 0.4971225 0.5596436 0.6225765 0.6345757 0.6548182 0.9677510
## Decision_Tree     0.5983998 0.6627996 0.6976009 0.6959152 0.7296611 0.8363205
## Random_Forest     0.3547400 0.4756874 0.5163212 0.5147180 0.5727098 0.6586124
## GLMNET            0.5215829 0.5953632 0.6677701 0.6681238 0.7344533 0.8144906
##                   NA's
## Linear_Regression    0
## Decision_Tree        0
## Random_Forest        0
## GLMNET               0
## 
## RMSE 
##                        Min.   1st Qu.    Median      Mean  3rd Qu.     Max.
## Linear_Regression 0.6691931 0.8429159 0.9569816 1.0283014 1.241142 1.811826
## Decision_Tree     0.7966954 0.9049267 1.0554327 1.0610764 1.167829 1.526170
## Random_Forest     0.5654006 0.7940049 0.9017547 0.9620457 1.105421 1.476581
## GLMNET            0.6974994 0.8209556 0.9923746 1.0467719 1.298232 1.507808
##                   NA's
## Linear_Regression    0
## Decision_Tree        0
## Random_Forest        0
## GLMNET               0
## 
## Rsquared 
##                        Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
## Linear_Regression 0.5664890 0.8018250 0.8690763 0.8350145 0.8928561 0.9238590
## Decision_Tree     0.6737419 0.7931297 0.8298058 0.8290319 0.8747315 0.8991564
## Random_Forest     0.6978782 0.8016444 0.8745980 0.8563503 0.9047156 0.9593668
## GLMNET            0.6714714 0.7511151 0.8444750 0.8280671 0.9034652 0.9345106
##                   NA's
## Linear_Regression    0
## Decision_Tree        0
## Random_Forest        0
## GLMNET               0
# Box-and-whisker plots of the resampled MAE / RMSE / R-squared per model
bwplot(results)

Hour 8

# Hour 8: train four competing models on the hour-8 training split.
# Date and Hour are dropped from the predictors (Date is an identifier;
# Hour is constant within this subset).
lin_reg_h8 <- train(Production ~ .,
                    data = df4_train_h8 %>% select(-Date, -Hour),
                    method = "lm",
                    trControl = fitControl,
                    # TRUE instead of T: T is a reassignable variable
                    tuneGrid = expand.grid(intercept = TRUE))

dec_tree_h8 <- train(Production ~ .,
                     data = df4_train_h8 %>% select(-Date, -Hour),
                     method = "rpart",
                     trControl = fitControl,
                     # cp fixed from earlier tuning; tuneLength is ignored
                     # whenever tuneGrid is supplied, so it is omitted here
                     tuneGrid = expand.grid(cp = 0.01831669))

rand_forest_h8 <- train(Production ~ .,
                        data = df4_train_h8 %>% select(-Date, -Hour),
                        method = "ranger",
                        trControl = fitControl,
                        num.trees = 50,
                        tuneGrid = expand.grid(mtry = 6,
                                               splitrule = "variance",
                                               min.node.size = 5),
                        importance = "impurity")

glmnet_h8 <- train(Production ~ .,
                   data = df4_train_h8 %>% select(-Date, -Hour),
                   method = "glmnet",
                   trControl = fitControl,
                   # fixed typo "tuneLenght=": the misspelled argument was
                   # silently forwarded through ... to the fitting call
                   tuneGrid = expand.grid(alpha = 0.1, lambda = 0.09524498))

# Pool the cross-validation resamples so all four models are compared on
# identical folds ("<-" for assignment, per the tidyverse style guide).
results <- resamples(list(Linear_Regression = lin_reg_h8,
                          Decision_Tree = dec_tree_h8,
                          Random_Forest = rand_forest_h8,
                          GLMNET = glmnet_h8))
summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: Linear_Regression, Decision_Tree, Random_Forest, GLMNET 
## Number of resamples: 25 
## 
## MAE 
##                       Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Linear_Regression 1.386200 1.466004 1.595995 1.591276 1.677737 1.855410    0
## Decision_Tree     1.423172 1.706762 1.789893 1.819229 1.945229 2.086366    0
## Random_Forest     1.125330 1.437559 1.516934 1.493616 1.569949 1.760493    0
## GLMNET            1.370608 1.488966 1.646669 1.641676 1.795324 1.902613    0
## 
## RMSE 
##                       Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Linear_Regression 1.802077 1.969944 2.203501 2.219020 2.491602 2.807579    0
## Decision_Tree     2.013645 2.247497 2.497295 2.509986 2.757508 3.057554    0
## Random_Forest     1.639538 2.059600 2.250837 2.193052 2.357741 2.637183    0
## GLMNET            1.809695 2.018951 2.208819 2.252863 2.496700 2.837130    0
## 
## Rsquared 
##                        Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
## Linear_Regression 0.7115203 0.7910899 0.8262558 0.8181951 0.8534859 0.8831579
## Decision_Tree     0.6723375 0.7418487 0.7695122 0.7685163 0.7993603 0.8472942
## Random_Forest     0.7522255 0.7888502 0.8180590 0.8218847 0.8514367 0.9045943
## GLMNET            0.7302920 0.7736447 0.8144849 0.8119234 0.8492779 0.8745783
##                   NA's
## Linear_Regression    0
## Decision_Tree        0
## Random_Forest        0
## GLMNET               0
# Box-and-whisker plots of the resampled MAE / RMSE / R-squared per model
bwplot(results)

Hour 9

# Hour 9: train four competing models on the hour-9 training split.
# Date and Hour are dropped from the predictors (Date is an identifier;
# Hour is constant within this subset).
lin_reg_h9 <- train(Production ~ .,
                    data = df4_train_h9 %>% select(-Date, -Hour),
                    method = "lm",
                    trControl = fitControl,
                    # TRUE instead of T: T is a reassignable variable
                    tuneGrid = expand.grid(intercept = TRUE))

dec_tree_h9 <- train(Production ~ .,
                     data = df4_train_h9 %>% select(-Date, -Hour),
                     method = "rpart",
                     trControl = fitControl,
                     # cp fixed from earlier tuning; tuneLength is ignored
                     # whenever tuneGrid is supplied, so it is omitted here
                     tuneGrid = expand.grid(cp = 0.01831669))

rand_forest_h9 <- train(Production ~ .,
                        data = df4_train_h9 %>% select(-Date, -Hour),
                        method = "ranger",
                        trControl = fitControl,
                        num.trees = 50,
                        tuneGrid = expand.grid(mtry = 6,
                                               splitrule = "variance",
                                               min.node.size = 5),
                        importance = "impurity")

glmnet_h9 <- train(Production ~ .,
                   data = df4_train_h9 %>% select(-Date, -Hour),
                   method = "glmnet",
                   trControl = fitControl,
                   # fixed typo "tuneLenght=": the misspelled argument was
                   # silently forwarded through ... to the fitting call
                   tuneGrid = expand.grid(alpha = 0.1, lambda = 0.09524498))

# Pool the cross-validation resamples so all four models are compared on
# identical folds ("<-" for assignment, per the tidyverse style guide).
results <- resamples(list(Linear_Regression = lin_reg_h9,
                          Decision_Tree = dec_tree_h9,
                          Random_Forest = rand_forest_h9,
                          GLMNET = glmnet_h9))
summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: Linear_Regression, Decision_Tree, Random_Forest, GLMNET 
## Number of resamples: 25 
## 
## MAE 
##                       Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Linear_Regression 2.289454 2.511619 2.646099 2.624025 2.715956 2.965946    0
## Decision_Tree     2.215729 2.546226 2.727988 2.734191 2.969713 3.200974    0
## Random_Forest     2.157627 2.439685 2.531193 2.534053 2.633166 2.999513    0
## GLMNET            2.227191 2.409536 2.625687 2.634654 2.834732 3.149887    0
## 
## RMSE 
##                       Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Linear_Regression 3.142513 3.517448 3.693221 3.653569 3.841738 4.168773    0
## Decision_Tree     3.208230 3.613147 3.975658 3.928788 4.164211 4.821493    0
## Random_Forest     2.933008 3.440202 3.710938 3.679170 3.975709 4.479455    0
## GLMNET            2.850025 3.264149 3.659375 3.637146 3.909733 4.536640    0
## 
## Rsquared 
##                        Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
## Linear_Regression 0.5756786 0.6887956 0.7094638 0.7116834 0.7357064 0.7993492
## Decision_Tree     0.4937582 0.6255691 0.6524466 0.6606406 0.7276637 0.7760153
## Random_Forest     0.5641566 0.6495491 0.7127404 0.7028668 0.7559808 0.8243018
## GLMNET            0.5331377 0.6528997 0.7249066 0.7088563 0.7641697 0.8492393
##                   NA's
## Linear_Regression    0
## Decision_Tree        0
## Random_Forest        0
## GLMNET               0
# Box-and-whisker plots of the resampled MAE / RMSE / R-squared per model
bwplot(results)

Hour 10

# Hour 10: train four competing models on the hour-10 training split.
# Date and Hour are dropped from the predictors (Date is an identifier;
# Hour is constant within this subset). Continuation lines re-aligned to
# the opening parenthesis.
lin_reg_h10 <- train(Production ~ .,
                     data = df4_train_h10 %>% select(-Date, -Hour),
                     method = "lm",
                     trControl = fitControl,
                     # TRUE instead of T: T is a reassignable variable
                     tuneGrid = expand.grid(intercept = TRUE))

dec_tree_h10 <- train(Production ~ .,
                      data = df4_train_h10 %>% select(-Date, -Hour),
                      method = "rpart",
                      trControl = fitControl,
                      # cp fixed from earlier tuning; tuneLength is ignored
                      # whenever tuneGrid is supplied, so it is omitted here
                      tuneGrid = expand.grid(cp = 0.01831669))

rand_forest_h10 <- train(Production ~ .,
                         data = df4_train_h10 %>% select(-Date, -Hour),
                         method = "ranger",
                         trControl = fitControl,
                         num.trees = 50,
                         tuneGrid = expand.grid(mtry = 6,
                                                splitrule = "variance",
                                                min.node.size = 5),
                         importance = "impurity")

glmnet_h10 <- train(Production ~ .,
                    data = df4_train_h10 %>% select(-Date, -Hour),
                    method = "glmnet",
                    trControl = fitControl,
                    # fixed typo "tuneLenght=": the misspelled argument was
                    # silently forwarded through ... to the fitting call
                    tuneGrid = expand.grid(alpha = 0.1, lambda = 0.09524498))

# Pool the cross-validation resamples so all four models are compared on
# identical folds ("<-" for assignment, per the tidyverse style guide).
results <- resamples(list(Linear_Regression = lin_reg_h10,
                          Decision_Tree = dec_tree_h10,
                          Random_Forest = rand_forest_h10,
                          GLMNET = glmnet_h10))
summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: Linear_Regression, Decision_Tree, Random_Forest, GLMNET 
## Number of resamples: 25 
## 
## MAE 
##                       Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Linear_Regression 2.827019 3.138701 3.436751 3.399906 3.610517 4.000010    0
## Decision_Tree     3.114844 3.390721 3.458618 3.523573 3.667151 4.039803    0
## Random_Forest     2.736721 3.123714 3.236087 3.228411 3.331059 3.774466    0
## GLMNET            2.651210 3.045346 3.413147 3.397033 3.658127 4.185256    0
## 
## RMSE 
##                       Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Linear_Regression 3.845368 4.300051 4.686413 4.684854 5.188596 5.386741    0
## Decision_Tree     4.336040 4.931528 5.136224 5.230371 5.436961 6.127723    0
## Random_Forest     4.023837 4.500532 4.759400 4.775794 5.060122 5.703028    0
## GLMNET            3.799048 4.144273 4.722191 4.648664 4.915853 5.984849    0
## 
## Rsquared 
##                        Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
## Linear_Regression 0.4781749 0.5340666 0.6129745 0.6030435 0.6737420 0.7105188
## Decision_Tree     0.3394893 0.4704252 0.5486128 0.5239339 0.5648775 0.6604553
## Random_Forest     0.4852588 0.5571177 0.5851373 0.5911706 0.6407157 0.7318905
## GLMNET            0.3404036 0.5657334 0.6091553 0.6081792 0.6876935 0.7521348
##                   NA's
## Linear_Regression    0
## Decision_Tree        0
## Random_Forest        0
## GLMNET               0
# Box-and-whisker plots of the resampled MAE / RMSE / R-squared per model
bwplot(results)

Hour 11

# Hour 11: train four competing models on the hour-11 training split.
# Date and Hour are dropped from the predictors (Date is an identifier;
# Hour is constant within this subset). Continuation lines re-aligned to
# the opening parenthesis.
lin_reg_h11 <- train(Production ~ .,
                     data = df4_train_h11 %>% select(-Date, -Hour),
                     method = "lm",
                     trControl = fitControl,
                     # TRUE instead of T: T is a reassignable variable
                     tuneGrid = expand.grid(intercept = TRUE))

dec_tree_h11 <- train(Production ~ .,
                      data = df4_train_h11 %>% select(-Date, -Hour),
                      method = "rpart",
                      trControl = fitControl,
                      # cp fixed from earlier tuning; tuneLength is ignored
                      # whenever tuneGrid is supplied, so it is omitted here
                      tuneGrid = expand.grid(cp = 0.01831669))

rand_forest_h11 <- train(Production ~ .,
                         data = df4_train_h11 %>% select(-Date, -Hour),
                         method = "ranger",
                         trControl = fitControl,
                         num.trees = 50,
                         tuneGrid = expand.grid(mtry = 6,
                                                splitrule = "variance",
                                                min.node.size = 5),
                         importance = "impurity")

glmnet_h11 <- train(Production ~ .,
                    data = df4_train_h11 %>% select(-Date, -Hour),
                    method = "glmnet",
                    trControl = fitControl,
                    # fixed typo "tuneLenght=": the misspelled argument was
                    # silently forwarded through ... to the fitting call
                    tuneGrid = expand.grid(alpha = 0.1, lambda = 0.09524498))

# Pool the cross-validation resamples so all four models are compared on
# identical folds ("<-" for assignment, per the tidyverse style guide).
results <- resamples(list(Linear_Regression = lin_reg_h11,
                          Decision_Tree = dec_tree_h11,
                          Random_Forest = rand_forest_h11,
                          GLMNET = glmnet_h11))
summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: Linear_Regression, Decision_Tree, Random_Forest, GLMNET 
## Number of resamples: 25 
## 
## MAE 
##                       Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Linear_Regression 3.117203 3.378431 3.527296 3.588332 3.649869 4.598174    0
## Decision_Tree     2.909311 3.229981 3.634277 3.597204 3.881523 4.405803    0
## Random_Forest     2.849971 3.288536 3.337324 3.471296 3.742786 4.355053    0
## GLMNET            3.207660 3.449037 3.539508 3.585157 3.739453 4.286480    0
## 
## RMSE 
##                       Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Linear_Regression 4.325927 4.744098 5.064928 5.177447 5.338782 6.762147    0
## Decision_Tree     4.210169 5.097261 5.397524 5.421982 5.866497 6.429012    0
## Random_Forest     4.178219 4.770878 5.232375 5.164570 5.491026 6.805026    0
## GLMNET            4.317378 4.846473 5.227204 5.173494 5.460170 6.171554    0
## 
## Rsquared 
##                        Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
## Linear_Regression 0.2349625 0.4877636 0.5429064 0.5190425 0.5895693 0.6442228
## Decision_Tree     0.2939168 0.3957522 0.4956905 0.4807416 0.5472796 0.7180968
## Random_Forest     0.1775609 0.4675963 0.5175464 0.5193753 0.6141767 0.6775467
## GLMNET            0.3696828 0.4665804 0.5147169 0.5169379 0.5512434 0.6587083
##                   NA's
## Linear_Regression    0
## Decision_Tree        0
## Random_Forest        0
## GLMNET               0
# Box-and-whisker plots of the resampled MAE / RMSE / R-squared per model
bwplot(results)

Hour 12

# Hour 12: train four competing models on the hour-12 training split.
# Date and Hour are dropped from the predictors (Date is an identifier;
# Hour is constant within this subset). Continuation lines re-aligned to
# the opening parenthesis.
lin_reg_h12 <- train(Production ~ .,
                     data = df4_train_h12 %>% select(-Date, -Hour),
                     method = "lm",
                     trControl = fitControl,
                     # TRUE instead of T: T is a reassignable variable
                     tuneGrid = expand.grid(intercept = TRUE))

dec_tree_h12 <- train(Production ~ .,
                      data = df4_train_h12 %>% select(-Date, -Hour),
                      method = "rpart",
                      trControl = fitControl,
                      # cp fixed from earlier tuning; tuneLength is ignored
                      # whenever tuneGrid is supplied, so it is omitted here
                      tuneGrid = expand.grid(cp = 0.01831669))

rand_forest_h12 <- train(Production ~ .,
                         data = df4_train_h12 %>% select(-Date, -Hour),
                         method = "ranger",
                         trControl = fitControl,
                         num.trees = 50,
                         tuneGrid = expand.grid(mtry = 6,
                                                splitrule = "variance",
                                                min.node.size = 5),
                         importance = "impurity")

glmnet_h12 <- train(Production ~ .,
                    data = df4_train_h12 %>% select(-Date, -Hour),
                    method = "glmnet",
                    trControl = fitControl,
                    # fixed typo "tuneLenght=": the misspelled argument was
                    # silently forwarded through ... to the fitting call
                    tuneGrid = expand.grid(alpha = 0.1, lambda = 0.09524498))

# Pool the cross-validation resamples so all four models are compared on
# identical folds ("<-" for assignment, per the tidyverse style guide).
results <- resamples(list(Linear_Regression = lin_reg_h12,
                          Decision_Tree = dec_tree_h12,
                          Random_Forest = rand_forest_h12,
                          GLMNET = glmnet_h12))
summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: Linear_Regression, Decision_Tree, Random_Forest, GLMNET 
## Number of resamples: 25 
## 
## MAE 
##                       Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Linear_Regression 3.066959 3.421513 3.573793 3.590925 3.753643 4.278373    0
## Decision_Tree     3.182917 3.513272 3.687557 3.704190 3.905639 4.024236    0
## Random_Forest     2.931777 3.247847 3.460217 3.544897 3.767698 4.501917    0
## GLMNET            3.098569 3.364207 3.483382 3.560061 3.800194 3.980713    0
## 
## RMSE 
##                       Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Linear_Regression 4.242096 4.752972 5.039184 5.115155 5.374922 6.109485    0
## Decision_Tree     4.663445 5.303133 5.590856 5.538428 5.820182 6.131801    0
## Random_Forest     4.281506 4.809154 5.157451 5.236048 5.514498 6.674796    0
## GLMNET            4.367972 4.681305 5.073839 5.078893 5.497301 5.943480    0
## 
## Rsquared 
##                        Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
## Linear_Regression 0.3297208 0.4845615 0.5336605 0.5229874 0.5556144 0.6425681
## Decision_Tree     0.2426765 0.4040453 0.4407777 0.4485278 0.5057943 0.5839952
## Random_Forest     0.3030567 0.4142810 0.4938516 0.4928226 0.5760130 0.6557259
## GLMNET            0.3902284 0.4530634 0.5272406 0.5217724 0.5782865 0.6452469
##                   NA's
## Linear_Regression    0
## Decision_Tree        0
## Random_Forest        0
## GLMNET               0
# Box-and-whisker plots of the resampled MAE / RMSE / R-squared per model
bwplot(results)

Hour 13

# Hour 13: train four competing models on the hour-13 training split.
# Date and Hour are dropped from the predictors (Date is an identifier;
# Hour is constant within this subset). Continuation lines re-aligned to
# the opening parenthesis.
lin_reg_h13 <- train(Production ~ .,
                     data = df4_train_h13 %>% select(-Date, -Hour),
                     method = "lm",
                     trControl = fitControl,
                     # TRUE instead of T: T is a reassignable variable
                     tuneGrid = expand.grid(intercept = TRUE))

dec_tree_h13 <- train(Production ~ .,
                      data = df4_train_h13 %>% select(-Date, -Hour),
                      method = "rpart",
                      trControl = fitControl,
                      # cp fixed from earlier tuning; tuneLength is ignored
                      # whenever tuneGrid is supplied, so it is omitted here
                      tuneGrid = expand.grid(cp = 0.01831669))

rand_forest_h13 <- train(Production ~ .,
                         data = df4_train_h13 %>% select(-Date, -Hour),
                         method = "ranger",
                         trControl = fitControl,
                         num.trees = 50,
                         tuneGrid = expand.grid(mtry = 6,
                                                splitrule = "variance",
                                                min.node.size = 5),
                         importance = "impurity")

glmnet_h13 <- train(Production ~ .,
                    data = df4_train_h13 %>% select(-Date, -Hour),
                    method = "glmnet",
                    trControl = fitControl,
                    # fixed typo "tuneLenght=": the misspelled argument was
                    # silently forwarded through ... to the fitting call
                    tuneGrid = expand.grid(alpha = 0.1, lambda = 0.09524498))

# Pool the cross-validation resamples so all four models are compared on
# identical folds ("<-" for assignment, per the tidyverse style guide).
results <- resamples(list(Linear_Regression = lin_reg_h13,
                          Decision_Tree = dec_tree_h13,
                          Random_Forest = rand_forest_h13,
                          GLMNET = glmnet_h13))
summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: Linear_Regression, Decision_Tree, Random_Forest, GLMNET 
## Number of resamples: 25 
## 
## MAE 
##                       Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Linear_Regression 3.283446 3.678389 3.854194 3.819304 4.005245 4.315248    0
## Decision_Tree     3.378292 3.709032 3.837238 3.910158 4.081288 4.448233    0
## Random_Forest     3.059001 3.526965 3.732022 3.739911 3.981713 4.350794    0
## GLMNET            3.294857 3.660850 3.845855 3.859275 4.086487 4.569363    0
## 
## RMSE 
##                       Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Linear_Regression 4.686241 5.098131 5.357375 5.412314 5.650593 6.294016    0
## Decision_Tree     5.034172 5.618371 5.761708 5.825576 6.089539 6.707069    0
## Random_Forest     4.489901 5.210717 5.339412 5.468987 5.856344 6.248504    0
## GLMNET            4.493264 5.117803 5.374365 5.438886 5.709759 6.830564    0
## 
## Rsquared 
##                        Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
## Linear_Regression 0.3217198 0.4605431 0.5014228 0.4990038 0.5406080 0.6077315
## Decision_Tree     0.1864731 0.3852106 0.4155047 0.4191339 0.4672401 0.5790096
## Random_Forest     0.2846852 0.4355046 0.4765385 0.4825645 0.5367031 0.6348703
## GLMNET            0.2227228 0.4136758 0.5234453 0.4922653 0.5511015 0.6426111
##                   NA's
## Linear_Regression    0
## Decision_Tree        0
## Random_Forest        0
## GLMNET               0
# Box-and-whisker plots of the resampled MAE / RMSE / R-squared per model
bwplot(results)

Hour 14

# Hour 14: train four competing models on the hour-14 training split.
# Date and Hour are dropped from the predictors (Date is an identifier;
# Hour is constant within this subset). Continuation lines re-aligned to
# the opening parenthesis.
lin_reg_h14 <- train(Production ~ .,
                     data = df4_train_h14 %>% select(-Date, -Hour),
                     method = "lm",
                     trControl = fitControl,
                     # TRUE instead of T: T is a reassignable variable
                     tuneGrid = expand.grid(intercept = TRUE))

dec_tree_h14 <- train(Production ~ .,
                      data = df4_train_h14 %>% select(-Date, -Hour),
                      method = "rpart",
                      trControl = fitControl,
                      # cp fixed from earlier tuning; tuneLength is ignored
                      # whenever tuneGrid is supplied, so it is omitted here
                      tuneGrid = expand.grid(cp = 0.01831669))

rand_forest_h14 <- train(Production ~ .,
                         data = df4_train_h14 %>% select(-Date, -Hour),
                         method = "ranger",
                         trControl = fitControl,
                         num.trees = 50,
                         tuneGrid = expand.grid(mtry = 6,
                                                splitrule = "variance",
                                                min.node.size = 5),
                         importance = "impurity")

glmnet_h14 <- train(Production ~ .,
                    data = df4_train_h14 %>% select(-Date, -Hour),
                    method = "glmnet",
                    trControl = fitControl,
                    # fixed typo "tuneLenght=": the misspelled argument was
                    # silently forwarded through ... to the fitting call
                    tuneGrid = expand.grid(alpha = 0.1, lambda = 0.09524498))

# Pool the cross-validation resamples so all four models are compared on
# identical folds ("<-" for assignment, per the tidyverse style guide).
results <- resamples(list(Linear_Regression = lin_reg_h14,
                          Decision_Tree = dec_tree_h14,
                          Random_Forest = rand_forest_h14,
                          GLMNET = glmnet_h14))
summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: Linear_Regression, Decision_Tree, Random_Forest, GLMNET 
## Number of resamples: 25 
## 
## MAE 
##                       Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Linear_Regression 3.508180 3.924293 4.047812 4.106579 4.339893 4.949461    0
## Decision_Tree     3.130630 3.748720 4.029870 4.086679 4.472295 5.105601    0
## Random_Forest     3.308367 3.616677 3.917309 3.912651 4.099767 4.569188    0
## GLMNET            3.458218 3.937169 4.066736 4.112458 4.402946 4.685981    0
## 
## RMSE 
##                       Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Linear_Regression 4.705944 5.319636 5.617469 5.596154 5.920488 6.417937    0
## Decision_Tree     4.321025 5.394666 5.688121 5.784243 6.321231 7.268571    0
## Random_Forest     4.780873 5.091839 5.435449 5.450370 5.752966 6.220710    0
## GLMNET            4.534066 5.239388 5.482169 5.567657 5.966554 6.290298    0
## 
## Rsquared 
##                        Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
## Linear_Regression 0.2935323 0.3866280 0.4467493 0.4492857 0.5079710 0.5937375
## Decision_Tree     0.1723364 0.3594850 0.4487820 0.4165149 0.4802196 0.6322364
## Random_Forest     0.2765493 0.4379721 0.4707517 0.4695387 0.5184459 0.5960957
## GLMNET            0.2886252 0.4006892 0.4418233 0.4465131 0.5126977 0.5849430
##                   NA's
## Linear_Regression    0
## Decision_Tree        0
## Random_Forest        0
## GLMNET               0
# Box-and-whisker plots of the resampled MAE / RMSE / R-squared per model
bwplot(results)

Hour 15

# Hour 15: train four competing models on the hour-15 training split.
# Date and Hour are dropped from the predictors (Date is an identifier;
# Hour is constant within this subset). Continuation lines re-aligned to
# the opening parenthesis.
lin_reg_h15 <- train(Production ~ .,
                     data = df4_train_h15 %>% select(-Date, -Hour),
                     method = "lm",
                     trControl = fitControl,
                     # TRUE instead of T: T is a reassignable variable
                     tuneGrid = expand.grid(intercept = TRUE))

dec_tree_h15 <- train(Production ~ .,
                      data = df4_train_h15 %>% select(-Date, -Hour),
                      method = "rpart",
                      trControl = fitControl,
                      # cp fixed from earlier tuning; tuneLength is ignored
                      # whenever tuneGrid is supplied, so it is omitted here
                      tuneGrid = expand.grid(cp = 0.01831669))

rand_forest_h15 <- train(Production ~ .,
                         data = df4_train_h15 %>% select(-Date, -Hour),
                         method = "ranger",
                         trControl = fitControl,
                         num.trees = 50,
                         tuneGrid = expand.grid(mtry = 6,
                                                splitrule = "variance",
                                                min.node.size = 5),
                         importance = "impurity")

glmnet_h15 <- train(Production ~ .,
                    data = df4_train_h15 %>% select(-Date, -Hour),
                    method = "glmnet",
                    trControl = fitControl,
                    # fixed typo "tuneLenght=": the misspelled argument was
                    # silently forwarded through ... to the fitting call
                    tuneGrid = expand.grid(alpha = 0.1, lambda = 0.09524498))

# Pool the cross-validation resamples so all four models are compared on
# identical folds ("<-" for assignment, per the tidyverse style guide).
results <- resamples(list(Linear_Regression = lin_reg_h15,
                          Decision_Tree = dec_tree_h15,
                          Random_Forest = rand_forest_h15,
                          GLMNET = glmnet_h15))
summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: Linear_Regression, Decision_Tree, Random_Forest, GLMNET 
## Number of resamples: 25 
## 
## MAE 
##                       Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Linear_Regression 3.014716 3.304269 3.718817 3.610721 3.787755 4.325422    0
## Decision_Tree     2.775503 3.367507 3.592634 3.674717 3.964539 4.348776    0
## Random_Forest     2.655412 3.269812 3.444362 3.446101 3.623921 3.995736    0
## GLMNET            3.013965 3.358319 3.597367 3.600768 3.918883 4.123056    0
## 
## RMSE 
##                       Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Linear_Regression 4.213644 4.677034 5.041977 4.955536 5.359908 5.552394    0
## Decision_Tree     4.056718 4.749213 5.171054 5.196175 5.659125 6.364327    0
## Random_Forest     3.621872 4.613620 4.871145 4.865931 5.225792 5.727652    0
## GLMNET            4.107591 4.515084 4.892711 4.922951 5.362670 5.667085    0
## 
## Rsquared 
##                        Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
## Linear_Regression 0.3638357 0.4605392 0.4939939 0.5067243 0.5407407 0.6597904
## Decision_Tree     0.3053825 0.3864035 0.4864143 0.4664758 0.5240788 0.6637746
## Random_Forest     0.3714000 0.4607723 0.5025885 0.5229420 0.5848027 0.6823357
## GLMNET            0.3500360 0.4597862 0.5012187 0.5112695 0.5752844 0.6349804
##                   NA's
## Linear_Regression    0
## Decision_Tree        0
## Random_Forest        0
## GLMNET               0
# Box-and-whisker comparison of the resampled MAE/RMSE/R-squared across models.
bwplot(results)

Hour 16

# Hour 16: fit the four candidate models on the hour-16 training subset.
# Date and Hour are dropped: Hour is constant within an hourly subset and
# Date is an identifier, so neither carries predictive signal.

# Ordinary least-squares baseline.
lin_reg_h16 <- train(Production ~ .,
                     data = df4_train_h16 %>% select(-Date, -Hour),
                     method = "lm",
                     trControl = fitControl,
                     # TRUE, not T: T is a reassignable alias, not a keyword.
                     tuneGrid = expand.grid(intercept = TRUE))

# CART regression tree; complexity parameter fixed from earlier tuning.
dec_tree_h16 <- train(Production ~ .,
                      data = df4_train_h16 %>% select(-Date, -Hour),
                      method = "rpart",
                      trControl = fitControl,
                      tuneGrid = expand.grid(cp = 0.01831669))

# Random forest via ranger; hyperparameters fixed from earlier tuning.
rand_forest_h16 <- train(Production ~ .,
                         data = df4_train_h16 %>% select(-Date, -Hour),
                         method = "ranger",
                         trControl = fitControl,
                         num.trees = 50,
                         tuneGrid = expand.grid(mtry = 6,
                                                splitrule = "variance",
                                                min.node.size = 5),
                         # Impurity importance enables later variable-importance inspection.
                         importance = "impurity")

# Elastic net; alpha/lambda fixed from earlier tuning. The original call
# passed the misspelled argument `tuneLenght`, which caret silently forwards
# through `...`; it is dropped here because a fixed tuneGrid makes
# tuneLength irrelevant anyway.
glmnet_h16 <- train(Production ~ .,
                    data = df4_train_h16 %>% select(-Date, -Hour),
                    method = "glmnet",
                    trControl = fitControl,
                    tuneGrid = expand.grid(alpha = 0.1, lambda = 0.09524498))

# Collect cross-validation resamples and compare the four models.
results <- resamples(list(Linear_Regression = lin_reg_h16,
                          Decision_Tree = dec_tree_h16,
                          Random_Forest = rand_forest_h16,
                          GLMNET = glmnet_h16))
summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: Linear_Regression, Decision_Tree, Random_Forest, GLMNET 
## Number of resamples: 25 
## 
## MAE 
##                       Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Linear_Regression 1.995822 2.419684 2.573412 2.559386 2.707568 3.005746    0
## Decision_Tree     2.126803 2.418821 2.603387 2.604103 2.788904 3.029354    0
## Random_Forest     2.038063 2.255853 2.401145 2.438605 2.614763 2.903027    0
## GLMNET            2.006677 2.395276 2.543759 2.534565 2.712171 2.972613    0
## 
## RMSE 
##                       Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Linear_Regression 2.826198 3.225076 3.436756 3.437818 3.706063 3.946918    0
## Decision_Tree     2.955545 3.359719 3.492810 3.542564 3.832865 4.329442    0
## Random_Forest     2.854902 3.150525 3.339039 3.381359 3.667249 3.977621    0
## GLMNET            2.732919 3.215857 3.449467 3.413480 3.600293 3.974472    0
## 
## Rsquared 
##                        Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
## Linear_Regression 0.5528480 0.6289683 0.6632161 0.6643475 0.7024180 0.7815984
## Decision_Tree     0.5000898 0.5988085 0.6596745 0.6404629 0.6795656 0.7541845
## Random_Forest     0.5534123 0.6195876 0.6967533 0.6725992 0.7128051 0.7760320
## GLMNET            0.5469153 0.6370736 0.6684323 0.6676564 0.7013827 0.7893644
##                   NA's
## Linear_Regression    0
## Decision_Tree        0
## Random_Forest        0
## GLMNET               0
# Box-and-whisker comparison of the resampled MAE/RMSE/R-squared across models.
bwplot(results)

Hour 17

# Hour 17: fit the four candidate models on the hour-17 training subset.
# Date and Hour are dropped: Hour is constant within an hourly subset and
# Date is an identifier, so neither carries predictive signal.

# Ordinary least-squares baseline.
lin_reg_h17 <- train(Production ~ .,
                     data = df4_train_h17 %>% select(-Date, -Hour),
                     method = "lm",
                     trControl = fitControl,
                     # TRUE, not T: T is a reassignable alias, not a keyword.
                     tuneGrid = expand.grid(intercept = TRUE))

# CART regression tree; complexity parameter fixed from earlier tuning.
dec_tree_h17 <- train(Production ~ .,
                      data = df4_train_h17 %>% select(-Date, -Hour),
                      method = "rpart",
                      trControl = fitControl,
                      tuneGrid = expand.grid(cp = 0.01831669))

# Random forest via ranger; hyperparameters fixed from earlier tuning.
rand_forest_h17 <- train(Production ~ .,
                         data = df4_train_h17 %>% select(-Date, -Hour),
                         method = "ranger",
                         trControl = fitControl,
                         num.trees = 50,
                         tuneGrid = expand.grid(mtry = 6,
                                                splitrule = "variance",
                                                min.node.size = 5),
                         # Impurity importance enables later variable-importance inspection.
                         importance = "impurity")

# Elastic net; alpha/lambda fixed from earlier tuning. The original call
# passed the misspelled argument `tuneLenght`, which caret silently forwards
# through `...`; it is dropped here because a fixed tuneGrid makes
# tuneLength irrelevant anyway.
glmnet_h17 <- train(Production ~ .,
                    data = df4_train_h17 %>% select(-Date, -Hour),
                    method = "glmnet",
                    trControl = fitControl,
                    tuneGrid = expand.grid(alpha = 0.1, lambda = 0.09524498))

# Collect cross-validation resamples and compare the four models.
results <- resamples(list(Linear_Regression = lin_reg_h17,
                          Decision_Tree = dec_tree_h17,
                          Random_Forest = rand_forest_h17,
                          GLMNET = glmnet_h17))
summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: Linear_Regression, Decision_Tree, Random_Forest, GLMNET 
## Number of resamples: 25 
## 
## MAE 
##                       Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Linear_Regression 1.075375 1.272980 1.361687 1.349141 1.453807 1.689689    0
## Decision_Tree     1.232630 1.400354 1.497297 1.493019 1.611589 1.711721    0
## Random_Forest     1.081176 1.192849 1.303859 1.310273 1.423572 1.545252    0
## GLMNET            1.094692 1.225841 1.396720 1.356017 1.440666 1.678798    0
## 
## RMSE 
##                       Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Linear_Regression 1.556822 1.870488 2.048242 2.025076 2.179897 2.623589    0
## Decision_Tree     1.682365 2.079345 2.284816 2.228391 2.420291 2.697800    0
## Random_Forest     1.621155 1.851115 2.096050 2.062672 2.247884 2.406862    0
## GLMNET            1.634215 1.797872 2.048909 2.013617 2.136650 2.433957    0
## 
## Rsquared 
##                        Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
## Linear_Regression 0.6169692 0.7242731 0.7503258 0.7618300 0.7916721 0.8704356
## Decision_Tree     0.6183650 0.6657735 0.7005302 0.7111222 0.7550302 0.8306167
## Random_Forest     0.6666678 0.7204831 0.7433597 0.7535570 0.7991615 0.8413293
## GLMNET            0.6540482 0.7357092 0.7545931 0.7638747 0.8098491 0.8575323
##                   NA's
## Linear_Regression    0
## Decision_Tree        0
## Random_Forest        0
## GLMNET               0
# Box-and-whisker comparison of the resampled MAE/RMSE/R-squared across models.
bwplot(results)

Hour 18

# Hour 18: fit the four candidate models on the hour-18 training subset.
# Date and Hour are dropped: Hour is constant within an hourly subset and
# Date is an identifier, so neither carries predictive signal.

# Ordinary least-squares baseline.
lin_reg_h18 <- train(Production ~ .,
                     data = df4_train_h18 %>% select(-Date, -Hour),
                     method = "lm",
                     trControl = fitControl,
                     # TRUE, not T: T is a reassignable alias, not a keyword.
                     tuneGrid = expand.grid(intercept = TRUE))

# CART regression tree; complexity parameter fixed from earlier tuning.
dec_tree_h18 <- train(Production ~ .,
                      data = df4_train_h18 %>% select(-Date, -Hour),
                      method = "rpart",
                      trControl = fitControl,
                      tuneGrid = expand.grid(cp = 0.01831669))

# Random forest via ranger; hyperparameters fixed from earlier tuning.
rand_forest_h18 <- train(Production ~ .,
                         data = df4_train_h18 %>% select(-Date, -Hour),
                         method = "ranger",
                         trControl = fitControl,
                         num.trees = 50,
                         tuneGrid = expand.grid(mtry = 6,
                                                splitrule = "variance",
                                                min.node.size = 5),
                         # Impurity importance enables later variable-importance inspection.
                         importance = "impurity")

# Elastic net; alpha/lambda fixed from earlier tuning. The original call
# passed the misspelled argument `tuneLenght`, which caret silently forwards
# through `...`; it is dropped here because a fixed tuneGrid makes
# tuneLength irrelevant anyway.
glmnet_h18 <- train(Production ~ .,
                    data = df4_train_h18 %>% select(-Date, -Hour),
                    method = "glmnet",
                    trControl = fitControl,
                    tuneGrid = expand.grid(alpha = 0.1, lambda = 0.09524498))

# Collect cross-validation resamples and compare the four models.
results <- resamples(list(Linear_Regression = lin_reg_h18,
                          Decision_Tree = dec_tree_h18,
                          Random_Forest = rand_forest_h18,
                          GLMNET = glmnet_h18))
summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: Linear_Regression, Decision_Tree, Random_Forest, GLMNET 
## Number of resamples: 25 
## 
## MAE 
##                        Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
## Linear_Regression 0.3849398 0.4384068 0.4608148 0.4638871 0.4868914 0.5690345
## Decision_Tree     0.3409060 0.4155545 0.4368887 0.4557123 0.4949975 0.6299430
## Random_Forest     0.2465634 0.3312156 0.3483535 0.3628731 0.3904498 0.5107207
## GLMNET            0.3866043 0.4389573 0.4631879 0.4643945 0.4866759 0.5406999
##                   NA's
## Linear_Regression    0
## Decision_Tree        0
## Random_Forest        0
## GLMNET               0
## 
## RMSE 
##                        Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
## Linear_Regression 0.5683505 0.6480250 0.7070179 0.7145571 0.7677800 0.8857574
## Decision_Tree     0.5669806 0.7002014 0.7505882 0.7832582 0.8673409 1.0859657
## Random_Forest     0.4755392 0.6367056 0.6798621 0.6887588 0.7391388 0.9532257
## GLMNET            0.5564430 0.6447261 0.7202300 0.7040448 0.7554512 0.8456627
##                   NA's
## Linear_Regression    0
## Decision_Tree        0
## Random_Forest        0
## GLMNET               0
## 
## Rsquared 
##                        Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
## Linear_Regression 0.6514716 0.7455537 0.7683856 0.7680803 0.8137536 0.8563431
## Decision_Tree     0.5543311 0.6708407 0.7266006 0.7199871 0.7776893 0.8606548
## Random_Forest     0.6362266 0.7480361 0.8078318 0.7854434 0.8192655 0.8890177
## GLMNET            0.6617434 0.7375800 0.7703730 0.7733320 0.8269172 0.8513065
##                   NA's
## Linear_Regression    0
## Decision_Tree        0
## Random_Forest        0
## GLMNET               0
# Box-and-whisker comparison of the resampled MAE/RMSE/R-squared across models.
bwplot(results)

Hour 19

# Hour 19: fit the four candidate models on the hour-19 training subset.
# Date and Hour are dropped: Hour is constant within an hourly subset and
# Date is an identifier, so neither carries predictive signal.

# Ordinary least-squares baseline.
lin_reg_h19 <- train(Production ~ .,
                     data = df4_train_h19 %>% select(-Date, -Hour),
                     method = "lm",
                     trControl = fitControl,
                     # TRUE, not T: T is a reassignable alias, not a keyword.
                     tuneGrid = expand.grid(intercept = TRUE))

# CART regression tree; complexity parameter fixed from earlier tuning.
dec_tree_h19 <- train(Production ~ .,
                      data = df4_train_h19 %>% select(-Date, -Hour),
                      method = "rpart",
                      trControl = fitControl,
                      tuneGrid = expand.grid(cp = 0.01831669))

# Random forest via ranger; hyperparameters fixed from earlier tuning.
rand_forest_h19 <- train(Production ~ .,
                         data = df4_train_h19 %>% select(-Date, -Hour),
                         method = "ranger",
                         trControl = fitControl,
                         num.trees = 50,
                         tuneGrid = expand.grid(mtry = 6,
                                                splitrule = "variance",
                                                min.node.size = 5),
                         # Impurity importance enables later variable-importance inspection.
                         importance = "impurity")

# Elastic net; alpha/lambda fixed from earlier tuning. The original call
# passed the misspelled argument `tuneLenght`, which caret silently forwards
# through `...`; it is dropped here because a fixed tuneGrid makes
# tuneLength irrelevant anyway.
glmnet_h19 <- train(Production ~ .,
                    data = df4_train_h19 %>% select(-Date, -Hour),
                    method = "glmnet",
                    trControl = fitControl,
                    tuneGrid = expand.grid(alpha = 0.1, lambda = 0.09524498))

# Collect cross-validation resamples and compare the four models.
results <- resamples(list(Linear_Regression = lin_reg_h19,
                          Decision_Tree = dec_tree_h19,
                          Random_Forest = rand_forest_h19,
                          GLMNET = glmnet_h19))
summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: Linear_Regression, Decision_Tree, Random_Forest, GLMNET 
## Number of resamples: 25 
## 
## MAE 
##                         Min.    1st Qu.     Median      Mean   3rd Qu.
## Linear_Regression 0.11377618 0.13393312 0.14801816 0.1513910 0.1729970
## Decision_Tree     0.08023985 0.10338146 0.11422675 0.1232364 0.1301680
## Random_Forest     0.07622747 0.09059199 0.09913574 0.1045346 0.1135062
## GLMNET            0.10461614 0.12224866 0.13046102 0.1364589 0.1539398
##                        Max. NA's
## Linear_Regression 0.1959930    0
## Decision_Tree     0.2212080    0
## Random_Forest     0.1515046    0
## GLMNET            0.1789626    0
## 
## RMSE 
##                        Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
## Linear_Regression 0.1535314 0.1841391 0.2256384 0.2901539 0.3428669 0.5888926
## Decision_Tree     0.1781888 0.2233287 0.2621926 0.3143660 0.3232364 0.6067362
## Random_Forest     0.1689211 0.1928223 0.2303002 0.2832471 0.2849836 0.5778092
## GLMNET            0.1452817 0.1850964 0.2338504 0.2874187 0.3001882 0.5518154
##                   NA's
## Linear_Regression    0
## Decision_Tree        0
## Random_Forest        0
## GLMNET               0
## 
## Rsquared 
##                         Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
## Linear_Regression 0.06116251 0.1482205 0.2706640 0.2388334 0.3034867 0.3949668
## Decision_Tree     0.01650860 0.1092602 0.1639303 0.1872464 0.2333479 0.6517671
## Random_Forest     0.09916785 0.1838025 0.2741495 0.2900228 0.3910371 0.5452759
## GLMNET            0.07586490 0.1812867 0.2792879 0.2570271 0.3281074 0.4014769
##                   NA's
## Linear_Regression    0
## Decision_Tree        0
## Random_Forest        0
## GLMNET               0
# Box-and-whisker comparison of the resampled MAE/RMSE/R-squared across models.
bwplot(results)

6 Prediction for the Test Period

Since the Random Forest model generally performs best in the hourly model comparisons, we use it to make the predictions on the test set. The predictions are computed below.

# Predict on each hourly test set with the fitted ranger model extracted from
# caret (`$finalModel`). ranger's predict() takes the new data via `data =`
# and returns an object whose $predictions field holds the numeric
# predictions (consumed below when the results are joined to the test sets).
# NOTE(review): calling $finalModel directly bypasses any caret
# pre-processing; the visible train() calls specify no preProcess step, so
# this appears safe — confirm if preProcess is ever added.
rf_h5_pred_test <- predict(rand_forest_h5$finalModel, data = df4_test_h5)
rf_h6_pred_test <- predict(rand_forest_h6$finalModel, data = df4_test_h6)
rf_h7_pred_test <- predict(rand_forest_h7$finalModel, data = df4_test_h7)
rf_h8_pred_test <- predict(rand_forest_h8$finalModel, data = df4_test_h8)
rf_h9_pred_test <- predict(rand_forest_h9$finalModel, data = df4_test_h9)
rf_h10_pred_test <- predict(rand_forest_h10$finalModel, data = df4_test_h10)
rf_h11_pred_test <- predict(rand_forest_h11$finalModel, data = df4_test_h11)
rf_h12_pred_test <- predict(rand_forest_h12$finalModel, data = df4_test_h12)
rf_h13_pred_test <- predict(rand_forest_h13$finalModel, data = df4_test_h13)
rf_h14_pred_test <- predict(rand_forest_h14$finalModel, data = df4_test_h14)
rf_h15_pred_test <- predict(rand_forest_h15$finalModel, data = df4_test_h15)
rf_h16_pred_test <- predict(rand_forest_h16$finalModel, data = df4_test_h16)
rf_h17_pred_test <- predict(rand_forest_h17$finalModel, data = df4_test_h17)
rf_h18_pred_test <- predict(rand_forest_h18$finalModel, data = df4_test_h18)
rf_h19_pred_test <- predict(rand_forest_h19$finalModel, data = df4_test_h19)

The test-set predictions are bound to the corresponding test data set for each hour.

# Attach each hour's predictions to its test set: keep only the identifying
# columns plus the actual Production, and add the ranger predictions
# ($predictions is the numeric vector inside the ranger prediction object).
# Row order of df4_test_hX and rf_hX_pred_test must match, which holds
# because each prediction was computed directly from the same data frame.
df4_test_h5_w_pred <- df4_test_h5 %>% select(Date, Hour, Production) %>% mutate(Prediction = rf_h5_pred_test$predictions)
df4_test_h6_w_pred <- df4_test_h6 %>% select(Date, Hour, Production) %>% mutate(Prediction = rf_h6_pred_test$predictions)
df4_test_h7_w_pred <- df4_test_h7 %>% select(Date, Hour, Production) %>% mutate(Prediction = rf_h7_pred_test$predictions)
df4_test_h8_w_pred <- df4_test_h8 %>% select(Date, Hour, Production) %>% mutate(Prediction = rf_h8_pred_test$predictions)
df4_test_h9_w_pred <- df4_test_h9 %>% select(Date, Hour, Production) %>% mutate(Prediction = rf_h9_pred_test$predictions)
df4_test_h10_w_pred <- df4_test_h10 %>% select(Date, Hour, Production) %>% mutate(Prediction = rf_h10_pred_test$predictions)
df4_test_h11_w_pred <- df4_test_h11 %>% select(Date, Hour, Production) %>% mutate(Prediction = rf_h11_pred_test$predictions)
df4_test_h12_w_pred <- df4_test_h12 %>% select(Date, Hour, Production) %>% mutate(Prediction = rf_h12_pred_test$predictions)
df4_test_h13_w_pred <- df4_test_h13 %>% select(Date, Hour, Production) %>% mutate(Prediction = rf_h13_pred_test$predictions)
df4_test_h14_w_pred <- df4_test_h14 %>% select(Date, Hour, Production) %>% mutate(Prediction = rf_h14_pred_test$predictions)
df4_test_h15_w_pred <- df4_test_h15 %>% select(Date, Hour, Production) %>% mutate(Prediction = rf_h15_pred_test$predictions)
df4_test_h16_w_pred <- df4_test_h16 %>% select(Date, Hour, Production) %>% mutate(Prediction = rf_h16_pred_test$predictions)
df4_test_h17_w_pred <- df4_test_h17 %>% select(Date, Hour, Production) %>% mutate(Prediction = rf_h17_pred_test$predictions)
df4_test_h18_w_pred <- df4_test_h18 %>% select(Date, Hour, Production) %>% mutate(Prediction = rf_h18_pred_test$predictions)
df4_test_h19_w_pred <- df4_test_h19 %>% select(Date, Hour, Production) %>% mutate(Prediction = rf_h19_pred_test$predictions)

The hourly test sets with predictions are row-bound, producing a single test set with predictions covering the whole data.

# Stack the fifteen hourly prediction tables into a single test-set table and
# order it chronologically by date and hour. bind_rows on a list of tibbles
# with identical columns is equivalent to rbind here, just easier to read.
hourly_pred_tables <- list(
  df4_test_h5_w_pred, df4_test_h6_w_pred, df4_test_h7_w_pred,
  df4_test_h8_w_pred, df4_test_h9_w_pred, df4_test_h10_w_pred,
  df4_test_h11_w_pred, df4_test_h12_w_pred, df4_test_h13_w_pred,
  df4_test_h14_w_pred, df4_test_h15_w_pred, df4_test_h16_w_pred,
  df4_test_h17_w_pred, df4_test_h18_w_pred, df4_test_h19_w_pred
)
df4_test_w_pred <- hourly_pred_tables %>%
  bind_rows() %>%
  arrange(Date, Hour)
head(df4_test_w_pred)
## # A tibble: 6 x 4
##   Date        Hour Production Prediction
##   <date>     <dbl>      <dbl>      <dbl>
## 1 2020-12-01     5       0        0     
## 2 2020-12-01     6       0        0     
## 3 2020-12-01     7       0        0.0478
## 4 2020-12-01     8       1.51     2.91  
## 5 2020-12-01     9       5.29     5.67  
## 6 2020-12-01    10      19.1     10.5
# Summary statistics (mean, sd, quantiles, histogram) of actual vs predicted
# production on the test set.
skim(df4_test_w_pred %>% select(Production, Prediction))
Data summary
Name df4_test_w_pred %>% selec…
Number of rows 930
Number of columns 2
_______________________
Column type frequency:
numeric 2
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Production 0 1 7.61 9.21 0 0.00 2.60 14.42 30.00 ▇▂▂▂▁
Prediction 0 1 6.20 6.75 0 0.01 2.97 11.23 23.08 ▇▂▂▂▁

Prediction and residual analysis of the random forest model on the test set is shown. The actual vs. predicted plot shows that predicted values are generally close to the actual values, with only a few strongly deviating observations. The histogram of residuals is approximately normally distributed around a mean of 0, which is good for the model. Finally, the predicted vs. residuals plot shows residuals scattered around 0, which is also good; the few deviating values are neither numerous nor severe.

# Test-set residuals: actual minus predicted production.
rf_residuals_test <- df4_test_w_pred$Production - df4_test_w_pred$Prediction

# Actual vs predicted: points close to the dashed identity line indicate a
# good fit.
with(df4_test_w_pred,
     plot(Prediction, Production,
          xlab = "Predicted", ylab = "Actual",
          main = "Actual vs Predicted Plot for Random Forest Model with Test Set"))
abline(a = 0, b = 1, col = "red", lty = 2)

# Residual distribution: roughly symmetric around zero is what we want.
hist(rf_residuals_test,
     xlab = "Residuals",
     main = "Residuals Histogram of Random Forest Model")

# Predicted vs residuals: residuals scattered around the zero line with no
# systematic pattern.
plot(df4_test_w_pred$Prediction, rf_residuals_test,
     xlab = "Predicted", ylab = "Residuals",
     main = "Predicted vs Residuals Plot for Random Forest Model with Test Set")
abline(h = 0, col = "red", lty = 2)

Lastly, the RMSE of the test-set predictions is shown. It is not a high value, which also speaks in favor of the model.

# Overall test-set RMSE. caret::RMSE's signature is RMSE(pred, obs); the
# original call passed (obs, pred), which happens to yield the same number
# because RMSE is symmetric in its arguments, but naming the arguments makes
# the intent explicit and prevents errors if a non-symmetric metric (e.g. R2)
# is substituted later.
RMSE(pred = df4_test_w_pred$Prediction, obs = df4_test_w_pred$Production)
## [1] 4.666542